{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "import os\n",
    "# plt.style.use('seaborn-colorblind')\n",
    "# %matplotlib inline\n",
    "from feature_cleaning import outlier as ot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 6)\n"
     ]
    }
   ],
   "source": [
    "use_cols = [\n",
    "    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
    "    'Survived'\n",
    "]\n",
    "\n",
    "\n",
    "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
    "data.head(3)\n",
    "print(data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "104      0.0000\n",
       "163      4.0125\n",
       "245      5.0000\n",
       "152      6.2375\n",
       "240      6.4375\n",
       "237      6.4500\n",
       "110      6.4958\n",
       "85       6.7500\n",
       "171      6.8583\n",
       "238      6.9500\n",
       "78       6.9750\n",
       "185      7.0458\n",
       "79       7.0500\n",
       "218      7.0542\n",
       "123      7.1250\n",
       "76       7.1417\n",
       "18       7.2250\n",
       "32       7.2292\n",
       "0        7.2500\n",
       "91       7.3125\n",
       "196      7.4958\n",
       "186      7.5208\n",
       "120      7.5500\n",
       "192      7.6292\n",
       "55       7.6500\n",
       "182      7.7250\n",
       "166      7.7292\n",
       "93       7.7333\n",
       "229      7.7375\n",
       "227      7.7417\n",
       "         ...   \n",
       "46      80.0000\n",
       "177     81.8583\n",
       "30      82.1708\n",
       "147     83.1583\n",
       "47      83.4750\n",
       "126     86.5000\n",
       "180     89.1042\n",
       "117     90.0000\n",
       "136     91.0792\n",
       "198     93.5000\n",
       "200    106.4250\n",
       "144    108.9000\n",
       "143    110.8833\n",
       "114    113.2750\n",
       "168    120.0000\n",
       "155    133.6500\n",
       "151    134.5000\n",
       "130    135.6333\n",
       "27     146.5208\n",
       "139    151.5500\n",
       "129    153.4625\n",
       "150    164.8667\n",
       "224    211.3375\n",
       "162    211.5000\n",
       "199    221.7792\n",
       "164    227.5250\n",
       "75     247.5208\n",
       "148    262.3750\n",
       "23     263.0000\n",
       "127    512.3292\n",
       "Length: 248, dtype: float64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(data.Fare.unique()).sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Detect by arbitrary boundary\n",
    "identify outliers based on arbitrary boundaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 19\n",
      "Proportion of outlier detected 0.02132435465768799\n",
      "Upper bound: 300 \n",
      "Lower bound: 5\n"
     ]
    }
   ],
   "source": [
    "index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "179      0.0000\n",
       "806      0.0000\n",
       "732      0.0000\n",
       "674      0.0000\n",
       "633      0.0000\n",
       "597      0.0000\n",
       "815      0.0000\n",
       "466      0.0000\n",
       "481      0.0000\n",
       "302      0.0000\n",
       "277      0.0000\n",
       "271      0.0000\n",
       "263      0.0000\n",
       "413      0.0000\n",
       "822      0.0000\n",
       "378      4.0125\n",
       "679    512.3292\n",
       "737    512.3292\n",
       "258    512.3292\n",
       "Name: Fare, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check the 19 found outliers\n",
    "data.loc[index,'Fare'].sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IQR method\n",
    "outlier detection by Interquartile Ranges Rule"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 31\n",
      "Proportion of outlier detected 0.03479236812570146\n",
      "Upper bound: 146.448 \n",
      "Lower bound: -107.53760000000001\n"
     ]
    }
   ],
   "source": [
    "index,para = ot.outlier_detect_IQR(data=data,col='Fare',threshold=5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31     146.5208\n",
       "195    146.5208\n",
       "305    151.5500\n",
       "708    151.5500\n",
       "297    151.5500\n",
       "498    151.5500\n",
       "609    153.4625\n",
       "332    153.4625\n",
       "268    153.4625\n",
       "318    164.8667\n",
       "856    164.8667\n",
       "730    211.3375\n",
       "779    211.3375\n",
       "689    211.3375\n",
       "377    211.5000\n",
       "527    221.7792\n",
       "700    227.5250\n",
       "716    227.5250\n",
       "557    227.5250\n",
       "380    227.5250\n",
       "299    247.5208\n",
       "118    247.5208\n",
       "311    262.3750\n",
       "742    262.3750\n",
       "341    263.0000\n",
       "88     263.0000\n",
       "438    263.0000\n",
       "27     263.0000\n",
       "679    512.3292\n",
       "258    512.3292\n",
       "737    512.3292\n",
       "Name: Fare, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check the 31 found outliers\n",
    "data.loc[index,'Fare'].sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean and Standard Deviation Method\n",
    "outlier detection by Mean and Standard Deviation Method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 20\n",
      "Proportion of outlier detected 0.02244668911335578\n",
      "Upper bound: 181.2844937601173 \n",
      "Lower bound: -116.87607782296811\n"
     ]
    }
   ],
   "source": [
    "index,para = ot.outlier_detect_mean_std(data=data,col='Fare',threshold=3)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "779    211.3375\n",
       "730    211.3375\n",
       "689    211.3375\n",
       "377    211.5000\n",
       "527    221.7792\n",
       "716    227.5250\n",
       "700    227.5250\n",
       "380    227.5250\n",
       "557    227.5250\n",
       "118    247.5208\n",
       "299    247.5208\n",
       "311    262.3750\n",
       "742    262.3750\n",
       "27     263.0000\n",
       "341    263.0000\n",
       "88     263.0000\n",
       "438    263.0000\n",
       "258    512.3292\n",
       "737    512.3292\n",
       "679    512.3292\n",
       "Name: Fare, dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check the 20 found outliers\n",
    "data.loc[index,'Fare'].sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MAD method\n",
    "outlier detection by Median and Median Absolute Deviation Method (MAD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 160\n",
      "Proportion of outlier detected 0.17957351290684623\n"
     ]
    }
   ],
   "source": [
    "# too aggressive for our dataset, about 18% of cases are detected as outliers.\n",
    "index = ot.outlier_detect_MAD(data=data,col='Fare',threshold=3.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Imputation with arbitrary value\n",
    "impute outliers with arbitrary value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 19\n",
      "Proportion of outlier detected 0.02132435465768799\n",
      "Upper bound: 300 \n",
      "Lower bound: 5\n"
     ]
    }
   ],
   "source": [
    "# use any of the detection method above\n",
    "index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "      <td>15.2458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>256</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>79.2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0</td>\n",
       "      <td>86.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>512.3292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0</td>\n",
       "      <td>26.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>31.3875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1</td>\n",
       "      <td>79.6500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>264</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4</td>\n",
       "      <td>39.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.7750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0</td>\n",
       "      <td>153.4625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>135.6333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>31.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>41.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>37.0</td>\n",
       "      <td>0</td>\n",
       "      <td>29.7000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass     Sex   Age  SibSp      Fare\n",
       "255         1       3  female  29.0      0   15.2458\n",
       "256         1       1  female   NaN      0   79.2000\n",
       "257         1       1  female  30.0      0   86.5000\n",
       "258         1       1  female  35.0      0  512.3292\n",
       "259         1       2  female  50.0      0   26.0000\n",
       "260         0       3    male   NaN      0    7.7500\n",
       "261         1       3    male   3.0      4   31.3875\n",
       "262         0       1    male  52.0      1   79.6500\n",
       "263         0       1    male  40.0      0    0.0000\n",
       "264         0       3  female   NaN      0    7.7500\n",
       "265         0       2    male  36.0      0   10.5000\n",
       "266         0       3    male  16.0      4   39.6875\n",
       "267         1       3    male  25.0      1    7.7750\n",
       "268         1       1  female  58.0      0  153.4625\n",
       "269         1       1  female  35.0      0  135.6333\n",
       "270         0       1    male   NaN      0   31.0000\n",
       "271         1       3    male  25.0      0    0.0000\n",
       "272         1       2  female  41.0      0   19.5000\n",
       "273         0       1    male  37.0      0   29.7000\n",
       "274         1       3  female   NaN      0    7.7500"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[255:275]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "      <td>15.2458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>256</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>79.2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0</td>\n",
       "      <td>86.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>-999.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0</td>\n",
       "      <td>26.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>31.3875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1</td>\n",
       "      <td>79.6500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0</td>\n",
       "      <td>-999.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>264</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4</td>\n",
       "      <td>39.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.7750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0</td>\n",
       "      <td>153.4625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>135.6333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>31.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>-999.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>41.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>37.0</td>\n",
       "      <td>0</td>\n",
       "      <td>29.7000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass     Sex   Age  SibSp      Fare\n",
       "255         1       3  female  29.0      0   15.2458\n",
       "256         1       1  female   NaN      0   79.2000\n",
       "257         1       1  female  30.0      0   86.5000\n",
       "258         1       1  female  35.0      0 -999.0000\n",
       "259         1       2  female  50.0      0   26.0000\n",
       "260         0       3    male   NaN      0    7.7500\n",
       "261         1       3    male   3.0      4   31.3875\n",
       "262         0       1    male  52.0      1   79.6500\n",
       "263         0       1    male  40.0      0 -999.0000\n",
       "264         0       3  female   NaN      0    7.7500\n",
       "265         0       2    male  36.0      0   10.5000\n",
       "266         0       3    male  16.0      4   39.6875\n",
       "267         1       3    male  25.0      1    7.7750\n",
       "268         1       1  female  58.0      0  153.4625\n",
       "269         1       1  female  35.0      0  135.6333\n",
       "270         0       1    male   NaN      0   31.0000\n",
       "271         1       3    male  25.0      0 -999.0000\n",
       "272         1       2  female  41.0      0   19.5000\n",
       "273         0       1    male  37.0      0   29.7000\n",
       "274         1       3  female   NaN      0    7.7500"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# see index 258,263,271 have been replaced\n",
    "data2 = ot.impute_outlier_with_arbitrary(data=data,outlier_index=index,\n",
    "                                         value=-999,col=['Fare'])\n",
    "data2[255:275]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Windsorization\n",
    "top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 19\n",
      "Proportion of outlier detected 0.02132435465768799\n",
      "Upper bound: 300 \n",
      "Lower bound: 5\n"
     ]
    }
   ],
   "source": [
    "# use any of the detection method above\n",
    "index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "      <td>15.2458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>256</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>79.2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0</td>\n",
       "      <td>86.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>300.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0</td>\n",
       "      <td>26.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>31.3875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1</td>\n",
       "      <td>79.6500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0</td>\n",
       "      <td>5.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>264</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4</td>\n",
       "      <td>39.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.7750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0</td>\n",
       "      <td>153.4625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>135.6333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>31.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>5.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>41.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>37.0</td>\n",
       "      <td>0</td>\n",
       "      <td>29.7000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass     Sex   Age  SibSp      Fare\n",
       "255         1       3  female  29.0      0   15.2458\n",
       "256         1       1  female   NaN      0   79.2000\n",
       "257         1       1  female  30.0      0   86.5000\n",
       "258         1       1  female  35.0      0  300.0000\n",
       "259         1       2  female  50.0      0   26.0000\n",
       "260         0       3    male   NaN      0    7.7500\n",
       "261         1       3    male   3.0      4   31.3875\n",
       "262         0       1    male  52.0      1   79.6500\n",
       "263         0       1    male  40.0      0    5.0000\n",
       "264         0       3  female   NaN      0    7.7500\n",
       "265         0       2    male  36.0      0   10.5000\n",
       "266         0       3    male  16.0      4   39.6875\n",
       "267         1       3    male  25.0      1    7.7750\n",
       "268         1       1  female  58.0      0  153.4625\n",
       "269         1       1  female  35.0      0  135.6333\n",
       "270         0       1    male   NaN      0   31.0000\n",
       "271         1       3    male  25.0      0    5.0000\n",
       "272         1       2  female  41.0      0   19.5000\n",
       "273         0       1    male  37.0      0   29.7000\n",
       "274         1       3  female   NaN      0    7.7500"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# see index 258,263,271 have been replaced with top/bottom coding\n",
    "\n",
    "data3 = ot.windsorization(data=data,col='Fare',para=para,strategy='both')\n",
    "data3[255:275]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Discard outliers\n",
    "Drop the cases that are outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 19\n",
      "Proportion of outlier detected 0.02132435465768799\n",
      "Upper bound: 300 \n",
      "Lower bound: 5\n"
     ]
    }
   ],
   "source": [
    "# use any of the detection method above\n",
    "index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "263.0\n",
      "5.0\n"
     ]
    }
   ],
   "source": [
    "# drop the outlier.\n",
    "# we can see no more observations have value >300 or <5. They've been removed.\n",
    "data4 = ot.drop_outlier(data=data,outlier_index=index)\n",
    "print(data4.Fare.max())\n",
    "print(data4.Fare.min())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean/Median/Mode Imputation\n",
    "replacing the outlier by mean/median/most frequent values of that variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num of outlier detected: 19\n",
      "Proportion of outlier detected 0.02132435465768799\n",
      "Upper bound: 300 \n",
      "Lower bound: 5\n"
     ]
    }
   ],
   "source": [
    "# use any of the detection method above\n",
    "index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
    "print('Upper bound:',para[0],'\\nLower bound:',para[1])\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "      <td>15.245800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>256</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>79.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0</td>\n",
       "      <td>86.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>32.204208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0</td>\n",
       "      <td>26.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>31.387500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1</td>\n",
       "      <td>79.650000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0</td>\n",
       "      <td>32.204208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>264</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4</td>\n",
       "      <td>39.687500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.775000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>268</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0</td>\n",
       "      <td>153.462500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>135.633300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>31.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>32.204208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>41.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>37.0</td>\n",
       "      <td>0</td>\n",
       "      <td>29.700000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>7.750000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass     Sex   Age  SibSp        Fare\n",
       "255         1       3  female  29.0      0   15.245800\n",
       "256         1       1  female   NaN      0   79.200000\n",
       "257         1       1  female  30.0      0   86.500000\n",
       "258         1       1  female  35.0      0   32.204208\n",
       "259         1       2  female  50.0      0   26.000000\n",
       "260         0       3    male   NaN      0    7.750000\n",
       "261         1       3    male   3.0      4   31.387500\n",
       "262         0       1    male  52.0      1   79.650000\n",
       "263         0       1    male  40.0      0   32.204208\n",
       "264         0       3  female   NaN      0    7.750000\n",
       "265         0       2    male  36.0      0   10.500000\n",
       "266         0       3    male  16.0      4   39.687500\n",
       "267         1       3    male  25.0      1    7.775000\n",
       "268         1       1  female  58.0      0  153.462500\n",
       "269         1       1  female  35.0      0  135.633300\n",
       "270         0       1    male   NaN      0   31.000000\n",
       "271         1       3    male  25.0      0   32.204208\n",
       "272         1       2  female  41.0      0   19.500000\n",
       "273         0       1    male  37.0      0   29.700000\n",
       "274         1       3  female   NaN      0    7.750000"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# see index 258,263,271 have been replaced with mean\n",
    "\n",
    "data5 = ot.impute_outlier_with_avg(data=data,col='Fare',\n",
    "                                   outlier_index=index,strategy='mean')\n",
    "data5[255:275]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
